simSummary = data.frame(realData = c(rep('Allen', 3),rep('Zeisel',3)),
K = rep(2, 6),
nclusters = rep(3, 6),
ncells = rep('100', 6),
zeroFract = rep(c(.25, .5, .75),2))
kable(simSummary)
| realData | K | nclusters | ncells | zeroFract |
|---|---|---|---|---|
| Allen | 2 | 3 | 100 | 0.25 |
| Allen | 2 | 3 | 100 | 0.50 |
| Allen | 2 | 3 | 100 | 0.75 |
| Zeisel | 2 | 3 | 100 | 0.25 |
| Zeisel | 2 | 3 | 100 | 0.50 |
| Zeisel | 2 | 3 | 100 | 0.75 |
Steps are as follow
1. Filter out the genes that do not have at least 5 counts in at least 5 cells.
2. Sample at random 1000 genes.
3. Fit zinb to get \(W\), \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\).
4. Simulate \(W\) from multivariate gaussians fitted using mclust package. It becomes the truth W. Keeping the total variance constant, change the variance between and within the clusters.
5. Simulate \(\gamma_{\pi}\) and \(\gamma_{\mu}\) from bivariate gaussians. They become the truth \(\gamma_{\pi}\) and \(\gamma_{\mu}\). Change mean of \(\gamma_{\pi}\) to get different zero fractions.
6. Create zinb model = zinbModel(simW, \(\beta_{\mu}\),\(\beta_{\pi}\), simGammaMu, simGammaPi, \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\zeta\)).
7. Simulate counts nrep times using zinbSim(model).
Using this procedure, we have the simulated counts and true \(W\), \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\) where the paramters are
Throughout, we keep constant
data("allen")
prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
filterGenes <- apply(assay(prefilter) > 5, 1, sum) >= 5
postfilter <- prefilter[filterGenes, ]
core <- assay(postfilter)
#vars <- rowVars(log1p(core))
#names(vars) <- rownames(core)
#vars <- sort(vars, decreasing = TRUE)
#core <- core[names(vars)[1:1000],]
core = core[sample(1:nrow(core), 1000),]
bioIni = as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
pZero <- rowMeans(log1p(core) == 0)
names(pZero) <- rownames(core)
#sum(core == 0)/(nrow(core) * ncol(core))
We use Allen dataset with all cells and 1000 randomly selected genes. The dimensions are
dim(core)
## [1] 1000 285
Colors in left panel correspond to the area of the brain the cells were sampled from. Colors in right panel correspond to the inferred clusters in Allen paper.
par(mfrow = c(1,2))
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bioIni], pch=19, main="PCA of log-counts")
#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts")
Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
commondispersion = FALSE)))
## user system elapsed
## 845.772 534.673 534.658
Check correlations between \(W\), \(\gamma_{mu}\), \(\gamma_{pi}\), detection rate, and coverage.
W is not correlated with \(\gamma_{\mu}\) and \(\gamma_{\pi}\).
\(\gamma_{\mu}\) and \(\gamma_{\pi}\) are correlated.
\(\gamma_{\mu}\) is correlated with coverage.
\(\gamma_{\pi}\) is correlated with detection rate.
In what follow, we assume that \(W\) and \(\gamma\) are independent. We simulate \(W\) and \(\gamma\) separetely from bivariate gaussians. For W, we fit a mixture of gaussians specifying the number of clusters (K = 3 here).
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
gamma_mu = zinb@gamma_mu[1, ],
gamma_pi = zinb@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bioIni])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.000000000 -0.13676590 0.1572193 -0.008460554
## W2 -0.136765903 1.00000000 0.2258026 -0.349625986
## gamma_mu 0.157219282 0.22580259 1.0000000 -0.454762139
## gamma_pi -0.008460554 -0.34962599 -0.4547621 1.000000000
## detection_rate -0.157167024 0.38536219 0.5521462 -0.948360810
## coverage -0.340998512 0.09416144 0.7920812 -0.365406128
## detection_rate coverage
## W1 -0.1571670 -0.34099851
## W2 0.3853622 0.09416144
## gamma_mu 0.5521462 0.79208116
## gamma_pi -0.9483608 -0.36540613
## detection_rate 1.0000000 0.52694205
## coverage 0.5269421 1.00000000
Additionally, let’s check \(\alpha\) and \(\beta\).
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
alpha_mu_2 = zinb@alpha_mu[2, ],
alpha_pi_1 = zinb@alpha_pi[1, ],
alpha_pi_2 = zinb@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 -0.10838160 -0.05057317 0.05214476
## alpha_mu_2 -0.10838160 1.00000000 0.12737187 -0.01344765
## alpha_pi_1 -0.05057317 0.12737187 1.00000000 0.01138462
## alpha_pi_2 0.05214476 -0.01344765 0.01138462 1.00000000
df <- data.frame(beta_mu = zinb@beta_mu[1, ],
beta_pi = zinb@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.000000 -0.385929
## beta_pi -0.385929 1.000000
Pick number of cells to simulate. Here
ncells = 100
ncells
## [1] 100
Pick number of clusters to simulate. Here
nclust = 3
nclust
## [1] 3
We simulate \(W\) from three bivariate gaussians.
mar <- par("mar")
mar[c(2, 4)] <- 0
par(mfrow = c(1,3), mar = mar)
# zinbW
xlim = c(min(zinb@W[,1]) - 1, max(zinb@W[,1]) + 1)
ylim = c(min(zinb@W[,2]) - 1, max(zinb@W[,2]) + 1)
plot(zinb@W, col = cols[bioIni], xlim = xlim, ylim = ylim,
main = 'zinb fitted W\ncolor = brain area')
# mclustW
mclustW = Mclust(zinb@W, G = nclust)
plot(mclustW$data, main = 'multivar gauss fit\nmclust K = 3', xlab = 'W1', ylab = 'W2',
xlim = xlim, ylim = ylim, col = mclustW$classification)
# multivar gaussian
clust = sample(mclustW$classification, ncells, replace = TRUE)
simW1 = lapply(clust, function(i){
mvrnorm(n = 1, mu = mclustW$parameters$mean[, i],
Sigma = mclustW$parameters$variance$sigma[,, i])
})
simW1 = do.call(rbind, simW1)
bio = clust
plot(simW1, col = bio, main = 'multivar gauss sim\ntrue W\n100 cells',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
par(mfrow = c(1, 1))
\(N\) = total number of cells.
\(K\) = number of cluters. Here K = 3.
\(n_k\) = number of cells classified in cluster \(k\).
\(\bar{W}\) = mean of W.
\(\bar{W_k}\) = mean of W for cells in cluster \(k\).
\[V_{TOTAL} = \frac{1}{N-1} \sum_{k=1}^K \sum_{i=1}^{n_k} (W_{ik} - \bar{W})^2\] \[V_{TOTAL} = \frac{1}{N-1} (\sum_{k=1}^K \sum_{i=1}^{n_k} (W_{ik} - \bar{W_k})^2 + \sum_{k=1}^K n_k (\bar{W_k} - \bar{W})^2)\] \[V_{TOTAL} = \frac{1}{N-1} (S_{INTRA} + \sum_{k=1}^K n_k (\bar{W_k} - \bar{W})^2)\]
with \(S_{INTRA} = \sum_{k=1}^K \sum_{i=1}^{n_k} (W_{ik} - \bar{W_k})^2\).
We want to keep the total variance constant so that the values of W stay in the same domain as the domain of the true values of W. Keeping the total variance constant, we want to vary the within and between clusters variance so that we have a way to ease the problem or make it harder. Let’s replace \(\bar{W_k}\) -> \(a \bar{W_k}\) and \(S_{INTRA}\) -> \(b^2 S_{INTRA}\).
Then, \[V_{TOTAL} = \frac{1}{N-1} (b^2 S_{INTRA} + \sum_{k=1}^K n_k (a \bar{W_k} - \bar{W})^2)\]
\[ b^2 = \frac{1}{S_{INTRA}} ((N-1)V_{TOTAL} - \sum_{k=1}^K n_k (a \bar{W_k} - \bar{W})^2)\]
computeB2 = function(mclustW, a = 1){
N = nrow(mclustW$data)
Vtot = apply(mclustW$data, 2, var)
Vinter = (a * mclustW$parameters$mean - apply(mclustW$data, 2, mean))^2
nk = as.vector(table(mclustW$classification))
Vinter = as.vector(Vinter %*% nk)
mk = sapply(mclustW$classification, function(i) mclustW$parameters$mean[,i])
Vintra = apply((mclustW$data - t(mk))^2, 2, sum)
( 1 / Vintra ) * ( (N-1) * Vtot - Vinter )
}
a = seq(-1.5, 1.5, 0.01)
ylim = c(-1, 5)
b2 = sapply(a, function(x) computeB2(mclustW, x))
par(mfrow = c(1,2))
plot(a, t(b2)[,1], type = 'l', ylab = 'b2_1', xlab = 'a_1', ylim=ylim)
abline(h = 0, col= 'red')
abline(h = 1, col= 'green')
rect(a[1], ylim[1], a[length(a)], 0, density = 10,
col = 'red', border = "transparent", lty = par("lty"), lwd = par("lwd"))
plot(a, t(b2)[,2], type = 'l', ylab = 'b2_2', xlab = 'a_2', ylim=ylim)
abline(h = 0, col= 'red')
abline(h = 1, col= 'green')
rect(a[1], ylim[1], a[length(a)], 0, density = 10,
col = 'red', border = "transparent", lty = par("lty"), lwd = par("lwd"))
Check that code works for a = 0 and b = 0.
mar <- par("mar")
mar[c(2, 4)] <- 0
par(mfrow = c(1,3), mar = mar)
# zinbW
xlim = c(min(zinb@W[,1]) - 1, max(zinb@W[,1]) + 1)
ylim = c(min(zinb@W[,2]) - 1, max(zinb@W[,2]) + 1)
plot(simW1, col = bio, main = 'multivar gauss sim\nb2=1',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
a = c(0, 0)
b2 = computeB2(mclustW, a)
b2 = matrix(c(b2[1], 1, 1, b2[2]), ncol = 2, nrow=2)
simW = lapply(clust, function(i){
mvrnorm(n = 1, mu = mclustW$parameters$mean[, i] * a,
Sigma = mclustW$parameters$variance$sigma[,, i] * b2)
})
simW = do.call(rbind, simW)
plot(simW, col = bio, main = 'multivar gauss sim\nb2 big',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
a = c(1.07, 1.137)
b2 = computeB2(mclustW, a)
b2 = matrix(c(b2[1], 1, 1, b2[2]), ncol = 2, nrow=2)
simW = lapply(clust, function(i){
mvrnorm(n = 1, mu = mclustW$parameters$mean[, i] * a,
Sigma = mclustW$parameters$variance$sigma[,, i] * b2)
})
simW = do.call(rbind, simW)
plot(simW, col = bio, main = 'multivar gauss sim\nb2 small',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
par(mfrow = c(1,2))
# zinbW
plot(simW1, col = bio,
main = 'multivar gauss sim\ntrue W, 100 cells\nb2=1',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
a = c(.85, .7)
b2 = computeB2(mclustW, a)
b2 = matrix(c(b2[1], 1, 1, b2[2]), ncol = 2, nrow=2)
simW2 = lapply(clust, function(i){
mvrnorm(n = 1, mu = mclustW$parameters$mean[, i] * a,
Sigma = mclustW$parameters$variance$sigma[,, i] * b2)
})
simW2 = do.call(rbind, simW2)
plot(simW2, col = bio, main = 'multivar gauss sim\ntrue W, 100 cells\nb2=2',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
simW = list(simW1, simW2)
We fit a bivariate gaussian to \(\gamma_{\pi}\) and \(\gamma_{\mu}\) and use the fitted means and covariance matrix to simulate \(\gamma_{\pi}\) and \(\gamma_{\mu}\).
gamma = data.frame(gammaMu = zinb@gamma_mu[1, ],
gammaPi = zinb@gamma_pi[1, ])
mar <- par("mar")
mar[c(2, 4)] <- 0
par(mfrow = c(1,2), mar = mar)
# zinbGamma
xlim = c(min(gamma[,1]) - .5, max(gamma[,1]) + .5)
ylim = c(min(gamma[,2]) - .5, max(gamma[,2]) + .5)
plot(gamma[,1], gamma[,2], col = cols[bioIni],
xlim = xlim, ylim = ylim, xlab = 'gamma_mu', ylab = 'gamma_pi',
main = 'zinb fitted Gamma\ncolor = brain area')
# mclustW
mclustGamma = Mclust(gamma, G = 1)
# multivar gaussian
simGamma = mvrnorm(n = ncells, mu = mclustGamma$parameters$mean[,1],
Sigma = mclustGamma$parameters$variance$sigma[,, 1])
plot(simGamma, col = bio, main = 'bivar gauss sim\ntrue Gamma\n100 cells',
xlab = 'gamma_mu', ylab = 'gamma_pi', xlim = xlim, ylim = ylim)
par(mfrow = c(1, 1))
Let’s simulate counts from the simulated \(W\), \(\gamma_{\mu}\), and \(\gamma_{\pi}\). Zero fraction of the simulated counts is
mod = zinbModel(W=simW[[1]], gamma_mu = matrix(simGamma[,1], nrow = 1),
gamma_pi = matrix(simGamma[,2], nrow = 1),
alpha_mu=zinb@alpha_mu, alpha_pi=zinb@alpha_pi,
beta_mu=zinb@beta_mu, beta_pi=zinb@beta_pi, zeta = zinb@zeta)
sim = zinbSim(mod, seed = 8746)
core = t(sim$counts)
sum(core == 0)/(nrow(core) * ncol(core))
## [1] 0.46328
Note that correlation smaller than for real data, I think because \(\gamma_{\mu}\) and \(\gamma_{\pi}\) are not really gaussians.
df <- data.frame(gamma_mu = mod@gamma_mu[1, ],
gamma_pi = mod@gamma_pi[1, ])
pairs(df, pch=19, col = clust, main = 'Beta')
print(cor(df, method="spearman"))
## gamma_mu gamma_pi
## gamma_mu 1.0000000 -0.3442184
## gamma_pi -0.3442184 1.0000000
We want three different zero fractions (25%, 45%, 75%). A zero fraction of 45% corresponds to the real dataset zero fraction, \(\gamma_{\pi}\) and \(\gamma_{\mu}\) are simulated as described above. To change the zero fraction to 25% and 75%, we simulate \(\gamma_{\pi}\) and \(\gamma_{\mu}\) as before but the mean of \(\gamma_{\pi}\) is changed. We keep the mean of \(\gamma_{\mu}\) and the covariance matrix constant.
gammaOffset = c('25' = -3.5, '45' = 0, '75' = 3.6)
simGamma = lapply(gammaOffset, function(i){
mvrnorm(n = ncells, mu = mclustGamma$parameters$mean[,1] + c(0, i),
Sigma = mclustGamma$parameters$variance$sigma[,, 1])
})
Again, for the three different zero fraction, correlation is smaller than for real data.
for (i in 1:3){
df <- data.frame(gamma_mu = simGamma[[i]][, 1],
gamma_pi = simGamma[[i]][, 2])
pairs(df, pch=19, col = clust, main = 'Beta')
print(cor(df, method="spearman"))
}
## gamma_mu gamma_pi
## gamma_mu 1.0000000 -0.2579898
## gamma_pi -0.2579898 1.0000000
## gamma_mu gamma_pi
## gamma_mu 1.0000000 -0.1479988
## gamma_pi -0.1479988 1.0000000
## gamma_mu gamma_pi
## gamma_mu 1.0000000 -0.2717312
## gamma_pi -0.2717312 1.0000000
models = lapply(1:2, function(k){
lapply(1:3, function(i){
zinbModel(W=simW[[k]], gamma_mu = matrix(simGamma[[i]][,1], nrow = 1),
gamma_pi = matrix(simGamma[[i]][,2], nrow = 1),
alpha_mu=zinb@alpha_mu, alpha_pi=zinb@alpha_pi,
beta_mu=zinb@beta_mu, beta_pi=zinb@beta_pi, zeta = zinb@zeta)
})
})
pal = colorRampPalette(c("black","black", "red","yellow"), space="rgb")
par(mfrow=c(2, 3))
for (k in 1:2){
for (i in 1:3){
smoothScatter(colMeans(log1p(getMu(models[[k]][[i]]))),
colMeans(getPi(models[[k]][[i]])), nbin = 256,
nrpoints = Inf, pch = "", cex = .7, xlim = c(0,15),
xlab = "Average simulated log Mu",
main = paste0('var',k,'_z',names(gammaOffset)[i]),
ylab = "Average simulated Pi",ylim = c(0,1),
colramp = pal)
}
}
B = 10
bio = clust
sim = lapply(1:2, function(k){
lapply(1:3, function(i){
simModel = models[[k]][[i]]
simData = lapply(seq_len(B), function(j){
zinbSim(simModel, seed = j*i*k)
})
fileName = sprintf("./datasets/simAllen_var%s_z%s.rda",
k, names(gammaOffset[i]))
save(bio, simModel, simData, file = fileName)
sapply(simData, function(x) x$zeroFraction)
#sapply(simData, function(x) sum(colSums(x$counts) < 5))
})
})
zeroFrac = data.frame(do.call(cbind, do.call(cbind, sim)), stringsAsFactors = F)
colnames(zeroFrac) = c(paste0('var1_z', names(gammaOffset)),
paste0('var2_z', names(gammaOffset)))
ggplot(melt(zeroFrac), aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + xlab('zero fraction') +
theme_bw() + ylab('simulated zero fraction')
Let’s look at one simulated dataset for each zero fraction.
par(mfrow=c(2, 3))
for (k in 1:2){
for (i in 1:3){
fileName = sprintf("./datasets/simAllen_var%s_z%s.rda",
k, names(gammaOffset[i]))
load(fileName)
counts = simData[[1]]$counts
zero_rate <- rowMeans(counts == 0)
plot(rowMeans(getPi(models[[k]][[i]])), zero_rate,
main = paste0('var', k, '_z', names(gammaOffset)[i]),
xlab="Average simulated Pi", ylab="Simulated Zero Rate",
pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
abline(a=0,b=1,col='gray')
}
}
par(mfrow=c(2, 3))
for (k in 1:2){
for (i in 1:3){
fileName = sprintf("./datasets/simAllen_var%s_z%s.rda",
k, names(gammaOffset[i]))
load(fileName)
counts = simData[[1]]$counts
coverage <- rowMeans(log1p(counts))
plot(rowMeans(log1p(getMu(models[[k]][[i]]))), coverage,
xlim = c(3, 7),
xlab="Average simulated log Mu", ylab="Average log counts", pch=19,
col=bio,ylim = c(0,6),
main = paste0('var', k, '_z', names(gammaOffset)[i]))
abline(a=0,b=1,col='gray')
}
}